

#########################################################################
###############-------诊断预测模型R语言实战----------####################
##############################松哥统计###################################


################################Part one-诊断预测模型实战##############
#设置工作路径
getwd()
setwd("F:/Rproject/baseline")


#变量理解
#hypoglycemia                   0=no         1=yes
#dataset                        0=val        1=dev
#course_of_disease              0="<=10year"  1=">10year"
#hypertension                   0=no         1=yes
#Hyperlipidemia                 0=no         1=yes
#marital_status                 1=结婚  2=单身  3=离异   4=丧偶
#Cerebrovascular_diseases       0=no         1=yes
#Treat_Time                     0=“<3month”         1=“>=3month”
#History_of_hypoglycemia        0=no         1=yes
#DN                             0=no         1=yes
#DPVD                           0=no         1=yes
#education                      1=小学及以下 2=中学    3=大专及以上
#Cardiovascular                 0=no         1=yes
#gender                         0=male       1=female
#Insulin_injection_protocol     0=基础餐时   1=CSII
#drink_wine                     0=no         1=yes
#Fatty_liver                    0=no         1=yes
#PNP                            0=no         1=yes
#LDLC                           0=no         1=yes
#TG                             0=no         1=yes
#HDLC                           0=no         1=yes
#ALT                            0=no         1=yes
#AST                            0=no         1=yes
#Cr                             0=no         1=yes
#INS                            0=no         1=yes
#C_peptide                      0=no         1=yes
#age                            0="<55"      1=">=55"
#age1
#BUN                            0=no         1=yes
#UA                             0=no         1=yes
#RBG                            0=no         1=yes
#GRF                            0=no         1=yes
#SBP                            0=no         1=yes
#DBP                            0=no         1=yes
#FBG                            0=no         1=yes
#HbA1c                          0=no         1=yes
#TC                             0=no         1=yes
#BMI                            0=消瘦       1=正常     2超重
#CRP                            0=no         1=yes


##一、数据准备
#1.数据读取（读进来）
#install.packages("readr")
library(readr)
mydata <- read_csv("hypoglycemia_end.csv")

#删除有缺失值的行
mydata<-na.omit(mydata)
View(mydata)

names(mydata)

#必须做，了解变量类型非常重要
str(mydata)


#attach(mydata)


head(mydata)
#head(mydata,6)
--------------------------------
  
  
  #2.数据准备
  #2.1数值变量准备
  #2.1.1数据摘要
  
summary(mydata)
summary(mydata)

#2.2分类变量准备(必做的准备)
##2.2.1数字型的变量指定为分类变量

mydata$marital_status <-   
  factor(mydata$marital_status,  
         levels = c(1,2,3,4),  
         labels = c("married", "unmarried","divorce","widow")) 


# gcr 测试
colnamelist <- colnames(mydata)
colnamelist
mydata[,'']
# which((mydata$LDLC==0))
# 用于判断list里面有没有前面参数
'marital_status' %in% colnamelist



str(mydata)   #查看变量属性，重点看数值（num）与分类（factor）

table(mydata$marital_status)

#如果对于无序多分类，要设定哑变量可以用；改变因子水平次序 relevel()函数
#分类变量的label和level指定，可以在最后模型中安排
#rawdata$DM<-factor(rawdata$DM,labels=c('non.DM','DM'))
#relevel(rawdata$DM,ref= 'DM')


##字符型数值变量转换为数值变量（就是上面步骤的逆过程）
#mydata$marital_status<-as.numeric(mydata$marital_status)
#str(mydata)



###########################################
##########拆分数据方法#####################
#1.随机按比例拆分文件
#install.packages("caret")
# library(caret)
# set.seed(123)
# trianandvad<- createDataPartition(y=mydata$ID,p=0.70,list=FALSE)
# train <- mydata[trianandvad, ]
# vadi<-mydata[-trianandvad,] 
# write.csv(train, "train.csv")
# write.csv(vadi, "vadi.csv")
##########################################


##########################################
#2:数据集指定（如果数据集中已经有数据集分组变量，可以如下指定即可）
dev = mydata[mydata$dataset==1,]
vad = mydata[mydata$dataset==0,]
##########################################



##二、模型构建
#2.1传统先单后多：单因素Logsitic回归分析
# gcr 直接拿 hypoglycemia = 1的是y 后面跟着需要分析变量 这里是病程 后面family 是指明广义线性回归的二项逻辑回归
M1<-glm(hypoglycemia==1~ mydata$course_of_disease,data=mydata,family=binomial)

M1
summary(M1)

#看模型的系数及95%CI
cbind(coef= coef(M1),confint(M1))
#看模型的OR及95%CI
exp(cbind(OR= coef(M1),confint(M1)))


M2<-glm(hypoglycemia==1~ mydata$hypertension,data=mydata,family=binomial)
M2
summary(M2)

M3<-glm(hypoglycemia==1~ mydata$Hyperlipidemia,data=mydata,family=binomial)
M3
summary(M3)

#多因素分析
MM<-glm(hypoglycemia==1~ mydata$Hyperlipidemia+mydata$hypertension+
          mydata$course_of_disease,data=mydata,family=binomial)
MM
summary(MM)

#看模型的系数及95%CI
cbind(coef= coef(MM),confint(MM))
#看模型的OR及95%CI
exp(cbind(OR= coef(MM),confint(MM)))


#2.2单因素Logistic回归（批量执行）

uni_glm_model<-function(x){
  FML<-as.formula(paste0("hypoglycemia==1~",x))  #hypoglycemia==1，您以后就改绿色的变量即可
  glm1<-glm(FML,data = dev,family = binomial)
  glm2<-summary(glm1)
  
  #计算我们所要的指标
  OR<-round(exp(coef(glm1)),2)
  SE<-round(glm2$coefficients[,2],3)  
  CI2.5<-round(exp(coef(glm1)-1.96*SE),2)
  CI97.5<-round(exp(coef(glm1)+1.96*SE),2)
  CI<-paste0(CI2.5,'-',CI97.5)
  B<-round(glm2$coefficients[,1],3)
  Z<-round(glm2$coefficients[,3],3)
  P<-round(glm2$coefficients[,4],3)
  
  #将计算出来的指标制作为数据框
  uni_glm_model<-data.frame('characteristics'=x,
                            'B'=B,
                            'SE'=SE,
                            'OR'=OR,
                            'CI'=CI,
                            'Z' =Z,
                            'P'=P)[-1,]
  
  return(uni_glm_model)
}

#指定参与分析的若干自变量X
variable.names<-colnames(dev)[c(4:39)]  #要核实这里的X对应的列是否对？若分开的可以这样[c(3:18,20:40)]
variable.names

#运行上面自定义批量执行函数
uni_glm<-lapply(variable.names,uni_glm_model)
uni_glm


#install.packages("plyr")
library(plyr)

#生成单变量分析的综合结果
uni_glm<-ldply(uni_glm,data.frame)

#看下结果是啥样子的
uni_glm

View(uni_glm)


#将单因素分析的结果，写到csv中.
write.csv(uni_glm, "uni.csv")


#将P<0.05的结果挑选出来（如需）
uni_glm1 <- uni_glm[uni_glm$P<= 0.05,]
uni_glm1

#将P<0.1的结果挑选出来（如需）
uni_glm2 <- uni_glm[uni_glm$P<= 0.1,]
uni_glm2

#直接将P<0.05的变量的characteristics提取出来
uni_glm$characteristics[uni_glm$P<= 0.05]


#将P<0.05的结果，写到csv中（如需).
write.csv(uni_glm1, "p5.csv")
write.csv(uni_glm2, "p10.csv")


########################2.3多因素分析#####################
#1.多因素-enter法 
#先写一个公式，方便后面重复使用
fml<- as.formula(paste0('hypoglycemia==1~',paste0(uni_glm$characteristics[uni_glm$P<0.1],collapse = '+'))) #P<0.05也是可以的
fml

fml<-as.formula(hypoglycemia == 1 ~ course_of_disease + Hyperlipidemia + Treat_Time + 
                  Education + gender + BUN + RBG + TC)
fml

##1.1多因素enter回归
modelA<-glm(fml,data = dev,family=binomial)

modelA             #只能拿到模型的系数

summary(modelA)    #可以拿到模型概要




#2.多因素—forward
#modelB<-step(modelA,direction="forward")#这种表示的前进法不行.要如下操作
modelX<-glm(hypoglycemia~1,data = dev,family=binomial)
modelX

#modelB中的变量可以用前面的fml的结果复制过来
modelB<-step(modelX,scope=list(upper=~ course_of_disease + Hyperlipidemia + 
                                 Treat_Time + Education + gender + BUN + RBG + TC,
                               lower=~1),data = dev,family=binomial,direction ="forward")

summary(modelB)


#3.多因素-backward
modelC<-step(modelA,direction ="backward")
summary(modelC)

#4.多因素-both
modelD<-step(modelA,direction = "both")
summary(modelD)


#看模型的系数及95%CI
cbind(coef=coef(modelD),confint(modelD))

#看模型的OR及95%CI
exp(cbind(OR=coef(modelD),confint(modelD)))


#模型AIC
AIC(modelA,modelB,modelC,modelD)



#模型AIC比较
anova(modelA,modelB,test = "Chisq")
anova(modelA,modelC,test = "Chisq")
anova(modelA,modelD,test = "Chisq")
anova(modelB,modelC,test = "Chisq")
anova(modelB,modelD,test = "Chisq")
anova(modelC,modelD,test = "Chisq")
# test= 的选项可以后面选择"Rao","LRT" , "Chisq","F","Cp"
#注意ModelB，可以把模型B最终结果，再跑一次enter，然后比较就没有难问题了

###确定最终模型
#结果上述比较，决定采用modelD的结果

modelD<-step(modelA,direction = "both")
modelD
glm3<-summary(modelD)
glm3


#将多因素分析结果制备为发表格式
glm3$coefficients

OR<-round(exp(glm3$coefficients[,1]),2)
OR


SE<-round(glm3$coefficients[,2],3)
CI2.5<-round(exp(coef(modelD)-1.96*SE),2)
CI97.5<-round(exp(coef(modelD)+1.96*SE),2)
CI<-paste0(CI2.5,'-',CI97.5)
B<-round(glm3$coefficients[,1],3)
Z<-round(glm3$coefficients[,3],3)
P<-round(glm3$coefficients[,4],3)




#制作数据框#'characteristics'=multiname,
mlogit<-data.frame( 
  'B'=B,
  'SE'=SE,
  'OR'=OR,
  'CI'=CI,
  'Z' =Z,
  'P'=P)[-1,]   #-1是指删除常数项
mlogit

#删除第一行常数项，如果上面没有[-1,]，则会产生常数项，导致不能合并
#mlogit<-mlogit[-1,] 

View(mlogit)


#展示一下数据库变量列表
names(mydata)

fml

#hypoglycemia == 1 ~ course_of_disease + Hyperlipidemia + Treat_Time + 
#Education + BUN + RBG + TC
#提取最终模型，多因素分析有意义变量
multinames<-as.character(colnames(mydata)[c(4,6,9,13,30,32,38)])

#检验是否正确
multinames


#
mlogit<-data.frame('characteristics'=multinames,mlogit)

mlogit

View(mlogit)

#将多因素分析结果写入csv
write.csv(mlogit, "multi.csv")

##########至此先单后多分析完毕!!######################

#合并先单后多分析表格
final<-merge.data.frame(uni_glm,mlogit,by='characteristics',all = T,sort = T)


#展示结果
final
View(final)

#将结果写入CSV
write.csv(final, "final.csv")

###############################至此，先单后多完美结束####################################
#----------------------------------------------------------------------------------------

##########################Part TWO.模型验证#############################################
#1.模型区分度分析
#1.1训练集区分度AUC
#放了方便大家理解，我们采用两个模型进行后续演示，分别为全8因子模型和7因子模型

fml8<-as.formula(hypoglycemia == 1 ~ course_of_disease + Hyperlipidemia + Treat_Time + 
                   Education + gender + BUN + RBG + TC)
fml7<-as.formula(hypoglycemia == 1 ~ course_of_disease + Hyperlipidemia + Treat_Time + 
                   Education + BUN + RBG + TC)


model8<-glm(fml8,data = dev,family = binomial(logit))

model7<-glm(fml7,data = dev,family = binomial(logit))


#在建模人群中计算预测值
dev$predmodel8<- predict(newdata=dev,model8,"response")
dev$predmodel7<- predict(newdata=dev,model7,"response")
View(dev)

#在验证人群计算预测值
vad$predmodel8<- predict(newdata=vad,model8,"response")
vad$predmodel7<- predict(newdata=vad,model7,"response")
View(vad)


#构建模型，最终也就是为了得到预测的P值，有P值就有预测模型的一切啦！

#绘制ROC曲线,先安装pROC包

#install.packages("pROC")

library(pROC)

#在建模人群中绘制分别二条ROC曲线并给出阈值和ROC曲线下面积。
#建模集，model8的auc与roc分析
devmodelA <- roc(hypoglycemia~predmodel8, data = dev,smooth=F)
#devmodelA <- roc(hypoglycemia~predmodel8, data = dev,smooth=T)  #如果做平滑，建议不做
devmodelA

round(auc(devmodelA),3)
round(ci(auc(devmodelA)),3)


#ROC画图方法一(灵敏度与特异度ROC，非标准；但会给出AUC，阈值及对应的灵敏度和特异度)
plot(devmodelA, print.auc=TRUE, print.thres=TRUE,main = "ROC CURVE", 
     col= "blue",print.thres.col="blue",identity.col="blue",
     identity.lty=1,identity.lwd=1)


################################################################################

##建模集，model7的auc与roc分析
devmodelB <- roc(hypoglycemia~predmodel7, data = dev,smooth=F)
round(auc(devmodelB),3)
round(ci(auc(devmodelB)),3)


#ROC画图方法一：
plot(devmodelB, print.auc=TRUE, print.thres=TRUE,main = "ROC CURVE", col= "red",print.thres.col="red",identity.col="red",
     identity.lty=1,identity.lwd=1)


#################################################################
#在验证人群中分别绘制二条ROC曲线并给出阈值和ROC曲线下面积。


#####################################
#######验证集8因子模型roc############

vadmodelA <- roc(hypoglycemia~predmodel8, data = vad,smooth=F)
round(auc(vadmodelA),3)
round(ci(auc(vadmodelA)),3)

#ROC画图方法一
plot(vadmodelA, print.auc=TRUE, print.thres=TRUE,main = "ROC CURVE", col= "blue",print.thres.col="blue",identity.col="blue",
     identity.lty=1,identity.lwd=1)


#####################################
#######验证集7因子模型roc############

vadmodelB <- roc(hypoglycemia~predmodel7, data = vad,smooth=F)

#ROC画图方法一
plot(vadmodelB, print.auc=TRUE, print.thres=TRUE,main = "ROC CURVE", col= "red",print.thres.col="red",identity.col="red",
     identity.lty=1,identity.lwd=1)


#############################################
#######将二条ROC曲线放在一个图形中###########
#############################################
library(pROC)


#######################
#建模人群多条ROC方法一
devroc1 <- plot.roc(dev$hypoglycemia, dev$predmodel8, main="dev ROC", percent=TRUE, col="1")
devroc2 <- lines.roc(dev$hypoglycemia, dev$predmodel7, percent=TRUE, col="2")

legend("bottomright", legend=c("devmodel8", "devmodel7"), col=c("1", "2"), lwd=2)


####-------------------------------------------
#验证人群多条ROC方法一
vadroc1 <- plot.roc(vad$hypoglycemia, vad$predmodel8, main="vad ROC", percent=TRUE, col="1")
vadroc2 <- lines.roc(vad$hypoglycemia, vad$predmodel7, percent=TRUE, col="2")

legend("bottomright", legend=c("vadmodelA", "vadmodelB"), col=c("1", "2"), lwd=2)



##################################################
#两条曲线进行统计学比较
##################################################
#建模人群ROC比较

#devroc1和devroc2
roc.test(devroc1,devroc2)


#验证人群ROC比较
#vadroc1和vadroc2
roc.test(vadroc1,vadroc2)


###################################################
###8.校准度分析（只要有预测概率和目标Y即可）    ###
###################################################


###########################################
#######校准曲线实现方法一
###########################################
#install.packages("calibrate")
library(calibrate)
library(MASS)
#install.packages("rms")
library(rms)


#在建模人群中绘制Calibration plot
val.prob(dev$predmodel8,dev$hypoglycemia)
val.prob(dev$predmodel7,dev$hypoglycemia)


#在验证人群中绘制Calibration plot
val.prob(vad$predmodel8,vad$hypoglycemia)
val.prob(vad$predmodel7,vad$hypoglycemia)


#在建模人群中进行Hosmer-Lemeshow test检验
source("HLtest.R") #一定要把HLtest.R放在之前设定的起始目录中。
hl.ext2(dev$predmodel8,dev$hypoglycemia)
hl.ext2(dev$predmodel7,dev$hypoglycemia)


#在验证人群中进行Hosmer-Lemeshow test检验
source("HLtest.R") #一定要把HLtest.R放在之前设定的起始目录中。
hl.ext2(vad$predmodel8,vad$hypoglycemia)
hl.ext2(vad$predmodel7,vad$hypoglycemia)


####################################################
##校准曲线实现方法二：（Bootstrap方法)
####################################################
#install.packages("riskRegression")
library(riskRegression)


formula<-hypoglycemia == 1 ~course_of_disease + Hyperlipidemia + Treat_Time + 
  Education + BUN + RBG + TC

#在建模集中制作校准曲线
fit1<-glm(formula,data=dev,family = binomial())
xb<-Score(list("fit"=fit1),formula=hypoglycemia~1,
          null.model=FALSE,
          plots=c("calibration","ROC"),
          metrics=c("auc","brier"),
          B=1000,M=50,
          data=dev)
plotCalibration(xb,col="red")

#在验证集中制作校准曲线
fit2<-glm(fit1,data=vad,family = binomial())
xb<-Score(list("fit"=fit1),formula=hypoglycemia~1,
          null.model=FALSE,
          plots=c("calibration","ROC"),
          metrics=c("auc","brier"),
          B=1000,M=50,
          data=vad)
plotCalibration(xb,col="red")


#############################################################
#9.临床决策曲线分析                        ####
#############################################################
library(readr)
mydata <- read_csv("hypoglycemia_end.csv")

#删除有缺失值的行
mydata<-na.omit(mydata)

#数据集指定（如果数据集中已经有数据集分组变量，可以如下指定即可）
dev = mydata[mydata$dataset==1,]
vad = mydata[mydata$dataset==0,]


#################################################
#######方法一：rmda验证完成
#################################################
#install.packages("rmda")
library(rmda)

#http://mdbrown.github.io/rmda/


###绘制DCA曲线
#拟合模型（松哥提醒，注意model1and2,是否可以bootstarp，因为后面验证集的可以，试试）
#model_1为8因子模型
model_1<-decision_curve(hypoglycemia ~course_of_disease + Hyperlipidemia + Treat_Time + 
                          Education + BUN + gender+ RBG + TC,
                        data = dev,
                        family = binomial(logit),
                        thresholds = seq(0,1,by=0.01),
                        confidence.intervals = 0.95,
                        study.design = 'case-control',
                        population.prevalence =0.3)


#model_1为7因子模型
model_2<-decision_curve(hypoglycemia ~course_of_disease + Hyperlipidemia + Treat_Time + 
                          Education + BUN + RBG + TC,
                        data = dev,
                        family = binomial(logit),
                        thresholds = seq(0,1,by=0.01),
                        confidence.intervals = 0.95,
                        study.design = 'case-control',
                        population.prevalence =0.3)


#绘制曲线
plot_decision_curve(model_1,curve.names = c('model_1'),
                    xlim = c(0,0.8),
                    cost.benefit.axis = FALSE,
                    col = c('red'),
                    confidence.intervals = FALSE,
                    standardize = FALSE)


plot_decision_curve(model_2,curve.names = c('model_2'),
                    xlim = c(0,0.8),
                    cost.benefit.axis = FALSE,
                    col = c('green'),
                    confidence.intervals = FALSE,    #TRUE显示可信区间
                    standardize = FALSE)

#绘制多条曲线
plot_decision_curve( list(model_1, model_2), 
                     curve.names = c("model_1", "model_2"),
                     col = c("blue", "red"), 
                     confidence.intervals = FALSE,  #remove confidence intervals
                     cost.benefit.axis = FALSE, #remove cost benefit axis
                     legend.position = "topright") #remove the legend "bottomright""none""topright"
#添加可信区间
plot_decision_curve( list(model_1, model_2), 
                     curve.names = c("model_1", "model_2"),
                     col = c("blue", "red"), 
                     confidence.intervals = TRUE,  #confidence intervals
                     cost.benefit.axis = FALSE, #remove cost benefit axis
                     legend.position = "topright") #add the legend "bottomright" "topright" "none"



##验证集决策曲线，需要先生成验证集的预测概率
fml8<-as.formula(hypoglycemia == 1 ~ course_of_disease + Hyperlipidemia + Treat_Time + 
                   Education + gender + BUN + RBG + TC)
fml7<-as.formula(hypoglycemia == 1 ~ course_of_disease + Hyperlipidemia + Treat_Time + 
                   Education + BUN + RBG + TC)


model8<-glm(fml8,data = dev,family = binomial(logit))

model7<-glm(fml7,data = dev,family = binomial(logit))


#在建模人群中计算预测值
dev$predmodel8<- predict(newdata=dev,model8,"response")
dev$predmodel7<- predict(newdata=dev,model7,"response")


#在验证人群计算预测值
vad$predmodel8<- predict(newdata=vad,model8,"response")
vad$predmodel7<- predict(newdata=vad,model7,"response")



#8因子模型
vadmodel8 <- decision_curve(hypoglycemia~predmodel8,
                            data = vad,
                            fitted.risk = TRUE, 
                            thresholds = seq(0, .9, by = .05),
                            bootstraps = 200) 

plot_decision_curve(vadmodel8,curve.names = c('model_1'),
                    legend.position = "topright",
                    confidence.intervals = FALSE,    #remove confidence intervals)
                    standardize = FALSE) 


plot_decision_curve(vadmodel8,curve.names = c('model_1'),
                    legend.position = "topright",
                    confidence.intervals = TRUE,    #remove confidence intervals)
                    standardize = FALSE) 

#7因子模型
vadmodel7 <- decision_curve(hypoglycemia~predmodel7,
                            data = vad,
                            fitted.risk = TRUE, 
                            thresholds = seq(0, .9, by = .05),
                            bootstraps = 200) 

plot_decision_curve(vadmodel7, legend.position = "topright",
                    confidence.intervals = FALSE,
                    standardize = FALSE)  #remove confidence intervals

#绘制多条曲线
plot_decision_curve( list(vadmodel8, vadmodel7), 
                     curve.names = c("model_1", "model_2"),
                     col = c("blue", "red"), 
                     confidence.intervals = FALSE,  #remove confidence intervals
                     cost.benefit.axis = FALSE, #remove cost benefit axis
                     legend.position = "topright") #remove the legend "bottomright" "topright" "none"


###################################################################################
#Decision Curve Analysis(DCA第二种方法)
##############################################################
##########决策曲线第二种方法(dca.R)-succeed
#https://www.mskcc.org/departments/epidemiology-biostatistics/biostatistics/decision-curve-analysis

library(readr)
mydata <- read_csv("hypoglycemia_end.csv")


dev = mydata[mydata$dataset==1,]
vad = mydata[mydata$dataset==0,]

source("dca.R")
#install.packages("nricens")

library(rms)
library(foreign)
library(nricens)

#构建2个回归模型用于演示
##modelA为8因素模型
modelA<-glm(hypoglycemia ~course_of_disease + Hyperlipidemia + Treat_Time + 
              Education + BUN + gender+ RBG + TC, data = dev, family = binomial(link="logit"),x=TRUE)
summary(modelA)

#用modelA预测概率（建模集和验证集)
dev$predmodelA<- predict(newdata=dev,modelA,"response")
vad$predmodelA<- predict(newdata=vad,modelA,"response")

#is.data.frame(dev)
#View(dev)
View(vad)

#modelD为7因素最优模型
modelD <- glm(hypoglycemia ~course_of_disease + Hyperlipidemia + Treat_Time + 
                Education + BUN + RBG + TC, data = dev, family = binomial(link="logit"),x=TRUE)
summary(modelD)

#用modelD预测概率（建模集和验证集)
dev$predmodelD<- predict(newdata=dev,modelD,"response")
vad$predmodelD<- predict(newdata=vad,modelD,"response")

View(dev)

#训练集dca
dev<-as.data.frame(dev)
dca(data=dev, outcome="hypoglycemia",
    predictors=c("predmodelA","predmodelD"),
    smooth="TRUE", probability=c("TRUE","TRUE"),
    xstop=0.5) 


#验证集dca
vad<-as.data.frame(vad)
dca(data=vad, outcome="hypoglycemia", 
    predictors=c("predmodelA","predmodelD"),
    smooth="TRUE", probability=c("TRUE","TRUE"),
    xstop=0.5) 

#可以直接添加自变量predictors，如下面加入TC
dca(data=vad, outcome="hypoglycemia", 
    predictors=c("predmodelA","predmodelD","TC"),
    smooth="TRUE", probability=c("TRUE","TRUE","TRUE"),
    xstop=0.5) 

##############################################################
#----------------dca.R帮助文档---注意可以直接放预测因子-------
##############################################################
library(MASS)
data.set <- birthwt
#data.set
View(data.set)

model = glm(low ~ age + lwt, family=binomial(link="logit"), data=data.set)
data.set$predlow = predict(model, type="response")

#Decision Curve Analysis
dca(data=data.set, outcome="low", predictors="predlow", smooth="TRUE", xstop=0.50)               #predictors为预测概率

dca(data=data.set, outcome="low", predictors=c("age", "lwt"), probability=c("FALSE", "FALSE"))   #predictors为方程中的变量，而且可以多个

dca(data=data.set, outcome="low", predictors="age", smooth="TRUE", xstop=0.50, probability="FALSE", intervention="TRUE")   #百人干预净获益减少曲线



##########################################################
#如何利用别人已经发表的模型，在我们自己数据验证DCA
#Use the coefficients from the Brown model 
logodds_Brown = 0.75*(famhistory)+0.26*(age)-17.5 
#Convert to predicted probability 
data.set$phat_Brown = exp(logodds_Brown)/(1+exp(logodds_Brown)) 
#Run the decision curve 
dca(data=data.set, outcome="cancer", predictors="phat_Brown", xstop=0.35) 
###########################################################





#######################################################
##################模型可视化###########################
###################Nomogram############################


##############################################
##############方法一：普通列线图##############
library(readr)
mydata <- read_csv("hypoglycemia_end.csv")

#删除有缺失值的行
mydata<-na.omit(mydata)

###
mydata$course_of_disease<-factor(mydata$course_of_disease,
                                 levels = c(0,1),
                                 labels=c("<10 years",">=10years"))

mydata$Hyperlipidemia<-factor(mydata$Hyperlipidemia,
                              levels = c(0,1),
                              labels=c("No","Yes"))

mydata$Treat_Time<-factor(mydata$Treat_Time,
                          levels = c(0,1),
                          labels=c("<30days",">=30days"))

mydata$Education<-factor(mydata$Education,
                         levels = c(1,2,3),
                         labels=c("<primary","middle school",">college"))


mydata$BUN<-factor(mydata$BUN,
                   levels = c(0,1),
                   labels=c("No","Yes"))

mydata$RBG<-factor(mydata$RBG,
                   levels = c(0,1),
                   labels=c("No","Yes"))

mydata$TC<-factor(mydata$TC,
                  levels = c(0,1),
                  labels=c("No","Yes"))
##---------------------------------------------



#数据集指定（如果数据集中已经有数据集分组变量，可以如下指定即可）
dev = mydata[mydata$dataset==1,]
vad = mydata[mydata$dataset==0,]


library(rms)

#打包数据

ddist <- datadist(dev)
options(datadist='ddist')

#构建2个回归模型，注意nomo要用lrm构建模型

modelA2 <- lrm(hypoglycemia ~course_of_disease + Hyperlipidemia + Treat_Time + 
                 Education + gender + BUN + RBG + TC,data=dev)


modelB2 <- lrm(hypoglycemia ~course_of_disease + Hyperlipidemia + Treat_Time + 
                 Education + BUN + RBG + TC,data=dev)
#设置列线图参数
#第一行modelA就是刚才logistic回归的模型名称。lp选择True或False，是否显示线性预测坐标（linear predictor），fun是要自己设一个函数，对lp进行转换，并建立一个新坐标轴。此处就用logit变换的反函数，将lp转换为我们熟悉的风险概率-。function(x) 1/(1+exp(-x))这一串，即使用function()构建一个自定义函数，括号中的x从lp的范围中取值，代入1/(1+exp(-x))中运算。
#fun.at则是给新的坐标轴设置范围。funlabel则是给上面转换好的新坐标轴起个名字，Diagnostic possibility。其实有了这条坐标轴，上面lp那里也可以设为F，不显示了。

nomomodelA <- nomogram(modelA2,lp=F, 
                       fun=function(x)1/(1+exp(-x)),
                       fun.at=seq(0.1,1,by=0.1),
                       funlabel="Diagnostic possibility")

nomomodelB <- nomogram(modelB2,lp=F, 
                       fun=function(x)1/(1+exp(-x)),
                       fun.at=seq(0.1,1,by=0.1),
                       funlabel="Diagnostic possibility")

#方法一:绘制普通列线图

plot(nomomodelA)
plot(nomomodelB)



#####################################################
################方法二：交互列线图###################

#方法二：绘制交互式列线图安装程序包，必须用glm函数
#install.packages("regplot")
library(regplot)


modelC <- glm(hypoglycemia ~course_of_disease + Hyperlipidemia + Treat_Time + 
                Education + BUN + RBG + TC, data = dev, family = binomial(link="logit"))


regplot(modelC,observation=dev[5,]) #改变前面数字，即可以知道数据框中第几个人的预测概率

####如下为细节加强版
regplot(modelC,#模型名称
        observation=dev[3,],#改变前面数字，即可以知道数据框中第几个人的预测概率
        center=TRUE,       #将每个变量设置不从0开始
        title = "Nomogram",#设置标题
        points = TRUE,     #point最大刻度设置成100
        odds = FALSE,      #设置是否显示OR
        showP = TRUE,      #显示变量是否存在统计学意义
        rank = "sd",       #按照回归系数的SD进行变量排序
        clickable = TRUE) #是否可以点击进行交互#TRUE体验交互




##########################################################
##############方法三：普通列线图变种（细节）##############
#方法三，普通列线图变种，可以设置上下刻度
dd=datadist(dev)
options(datadist="dd")

fit<-lrm(formula,data=dev)
nomo<-nomogram(fit,#模型名称
               fun=function(x)1/(1+exp(-x)),#固定格式
               lp=T,
               fun.at = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9),#设置坐标轴刻度
               conf.int = c(0.1,0.7),
               funlabel = "risk")
#tiff(filename="nomo.tif,width=600,height=700)
plot(nomo,
     lplabel='Linear Predictor',#设置线性概率坐标轴名称
     fun.side =c(1,3,1,3,1,3,1,3,1),#坐标轴刻度位置
     col.grid=c("blue","yellow"),#垂直参考线的颜色
     col.conf=c('red','green'),#设置置信区间的颜色
     conf.space=c(0.1,0.5))#设置置信区间条在两条坐标轴之间的位置
#图形需要调整大小，让文字显示错开
#dev.off()  清空图

plot(nomo,
     lplabel='Linear Predictor',#设置线性概率坐标轴名称
     fun.side =c(1,3,1,3,1,3,1,3,1),#坐标轴刻度位置
     col.grid=c("blue","yellow"),#垂直参考线的颜色
     #col.conf=c('red','green'),#设置置信区间的颜色
     conf.space=c(0.1,0.5))#设置置信区间条在两条坐标轴之间的位置

####################################################
################方法四：动态列线图##################
#方法四：动态列线图(测试成功)

library(readr)
mydata <- read_csv("hypoglycemia_end.csv")

#删除有缺失值的行
mydata<-na.omit(mydata)

#数据集指定（如果数据集中已经有数据集分组变量，可以如下指定即可）
dev = mydata[mydata$dataset==1,]
#vad = mydata[mydata$dataset==0,]


#install.packages("DynNom")

#install.packages("shiny")
ddist <- datadist(dev)
options(datadist='ddist')



dev<-na.omit(dev)
dev<-as.data.frame(dev)

library(shiny)
library(DynNom)
library(magrittr)
modelD<-glm(hypoglycemia ~course_of_disease + Hyperlipidemia + Treat_Time + 
              Education + gender + BUN + RBG + TC,data=dev,family = binomial)

DynNom(modelD,DNtitle="Nomogram",DNxlab="probability",data = dev)


#######################################################################
#NRI如下成功测试
#程序包准备，需要先安装install.packages("nricens")程序包
#install.packages("nricens")
library(survival)
library(nricens)
library(rms)
library(foreign)

#数据准备
setwd("D:/R work")


#构建两个回归模型

devmodelA <- glm(hypoglycemia ~course_of_disease + Hyperlipidemia + Treat_Time 
                 ,data=dev, family = binomial(link="logit"),x=TRUE)

devmodelB <- glm(hypoglycemia ~course_of_disease + Hyperlipidemia + Treat_Time + 
                   Education + gender + BUN + RBG + TC,data=dev, 
                 family = binomial(link="logit"),x=TRUE)


#计算NRI

#计算连续的NRI取值为5%就定义为重分类了。
library(nricens)

NRI <- nribin(mdl.std = devmodelA, mdl.new = devmodelB,updown = 'diff',cut = 0.05,niter = 500,alpha = 0.05)
#注意，如果作图区域不够大，不出图，并且显示红色警告

#计算分类变量计算的NRI,只有概率超过0.48就定义为重分组了，0.48是根据之前模型C的ROC分析确定的切点

NRI <- nribin(mdl.std = devmodelA, mdl.new = devmodelB,
              updown = 'category',cut = 0.48,niter = 500,alpha = 0.05)


######################IDI##################################3
#程序包准备，需要先安装install.packages("PredictABEL")程序包
#数据准备
setwd("D:/R work")

#install.packages("PredictABEL")
library(PredictABEL)
library(nricens)
library(rms)
library(foreign)

library(readr)

mydata <- read_csv("hypoglycemia_end.csv")


dev = mydata[mydata$dataset==1,]
vad = mydata[mydata$dataset==0,]


modelA <- glm(hypoglycemia ~course_of_disease + Hyperlipidemia + Treat_Time 
              ,data=dev, family = binomial(link="logit"),x=TRUE)

dev$predmodelA<- predict(newdata=dev,modelA,"response")


modelB <- glm(hypoglycemia ~course_of_disease + Hyperlipidemia + Treat_Time + 
                Education + gender + BUN + RBG + TC,data=dev, 
              family = binomial(link="logit"),x=TRUE)

dev$predmodelB<- predict(newdata=dev,modelB,"response")


#模型B是新模型、模型A是旧模型
pstd <- modelA$fitted.values

pnew <- modelB$fitted.values


#计算IDI
dev<-as.matrix(dev)                         #必须如此，否则报错
reclassification(data = dev,cOutcome = 3,   #cOutcome的数字是指结局变量在那一列
                 predrisk1 = pstd, 
                 predrisk2 = pnew,
                 cutoff = c(0,0.48,1))

??IDI


##########LASSO#################################################
#利用R软件进行Lasso回归筛选预测因子（如下LASSO代码测试成功没问题）
################################################################
#install.packages("glmnet")

library(glmnet)
library(Matrix)
library(rms)
library(foreign)
library(readr)
setwd("D:/R work")

mydata <- read_csv("hypoglycemia_end.csv")

mydata<-na.omit(mydata)
#将结果变量中缺失数据删除，读者根据自己数据特点决定是否需要此命令。

dev = mydata[mydata$dataset==1,]
#vad = mydata[mydata$dataset==0,]

#write.csv(dev, "dev.csv")


#install.packages("psych")

library(psych)

describe(dev)
str(dev)

#将数据转换成因子变量,二分类变量可转可不转，注意对于lasso，无序多分类需要手动设置哑变量
#names(dev)
#for (i in names(dev)[c(4:6,8)]){dev[,i] <- as.factor(dev[,i])}
#str(dev)
View(dev)


############################封箱####################################
#无序多变量处理
#dev$marital_status<-factor(dev$marital_status,
#                           levels=c(1,2,3,4),
#                           labels=c("married", "unmarried","divorce","widow"))

#线面将在第41-44列产生四个变量
dev$marital_status1<-ifelse(dev$marital_status=='married',1,0)
dev$marital_status2<-ifelse(dev$marital_status=='unmarried',1,0)
dev$marital_status3<-ifelse(dev$marital_status=='divorce',1,0)
dev$marital_status4<-ifelse(dev$marital_status=='widow',1,0)

View(dev)
############################封箱#####################################
#批量数值转因子处理，如需
#for(i in names(dev)[c(4:40)]){
#  dev[,i]<-as.factor(dev[,i])
#}


#因子的处理

x <- as.matrix(data.frame(dev[,c(4:6,8:44)]))

#write.csv(x,file='xdata.csv')
y <- as.matrix(dev[,3])

fit<-glmnet(x,y,alpha=1,family='binomial')

plot(fit)


plot(fit, xvar = "lambda", label = TRUE)
plot(fit, xvar = "lambda", label = FALSE)

#abline(v=log(c(cv.fit$lambda.min,cv.fit$lambda.1se)),lty=2)  #运行CVlasso后可以运行
print(fit)


#可以直接用Lasso模型进行预测
dev$lassopred <-predict(fit,type="response",newx=x[1:685,],s=0.000572)  #x[1:685,]685代表咱dev中的样本量或记录数

View(dev)


#####################################################################
#############CVlasso交叉验证############目的：(1)筛选变量，找到最优模型，(2)利用最优模型预测概率
set.seed(123)
cv.fit <- cv.glmnet(x,y,alpha=1,nfolds = 10)
plot(cv.fit)
abline(v=log(c(cv.fit$lambda.min,cv.fit$lambda.1se)),lty=2)

#?cv.glmnet
#?glmnet
#如果取最小值时
cv.fit$lambda.min
Coefficients <- coef(fit, s = cv.fit$lambda.min)
Active.Index <- which(Coefficients != 0)
Active.Coefficients <- Coefficients[Active.Index]
Active.Index
Active.Coefficients
row.names(Coefficients)[Active.Index]
dev$minlassopred <-predict(fit,type="response",newx=x[1:685,],s=cv.fit$lambda.min)  #x[1:685,]685代表咱dev中的样本量或记录数

#如果取1倍标准误
cv.fit$lambda.1se
Coefficients <- coef(fit, s = cv.fit$lambda.1se)
Active.Index <- which(Coefficients != 0)
Active.Coefficients <- Coefficients[Active.Index]
Active.Index
Active.Coefficients
row.names(Coefficients)[Active.Index]
dev$selassopred <-predict(fit,type="response",newx=x[1:685,],s=cv.fit$lambda.1se)


########################Other important#######################################
#交叉验证与Bootstrap验证


getwd()
setwd("D:/R work")

library(readr)
mydata <- read_csv("hypoglycemia_end.csv")

#删除有缺失值的行
mydata<-na.omit(mydata)
View(mydata)

names(mydata)


#必须做，了解变量类型非常重要
str(mydata)

mydata$hypoglycemia<-factor(mydata$hypoglycemia,levels=c(0,1),labels=c("no","yes"))
mydata$ALT<-factor(mydata$ALT,levels=c(0,1),labels=c("no","yes"))
dev = mydata[mydata$dataset==1,]
vad = mydata[mydata$dataset==0,]



#install.packages("caret")
library(caret)
library(ggplot2)
library(lattice)


formula<-as.formula(hypoglycemia ~ course_of_disease + Hyperlipidemia + Treat_Time + 
                      Education + gender + BUN + RBG + TC)
formula



#1.交叉验证（简单交叉验证）
#p=0.7表示70%作为训练集，30%做为验证集，number=1是指迭代次数
train.control_1<-trainControl(method="LGOCV",p=0.7,number = 1)


dev$hypoglycemia<-factor(dev$hypoglycemia)


set.seed(123)

m1<-train(formula,data=dev,trControl=train.control_1,method="glm")

m1




#2.10折交叉验证CV

train.control_2<-trainControl(method = "CV",number = 10)

set.seed(123)

m2<-train(formula,data=dev,trControl=train.control_2,method="glm")

m2



#3.留一法LOOCV
train.control_3<-trainControl(method = "LOOCV")
m3<-train(formula,data=dev,trControl=train.control_3,method="glm")

m3



#Boot
train.control_4<-trainControl(method = "boot",
                              number = 50)
set.seed(123)

m4<-train(formula,data=dev,trControl=train.control_4,method="glm")
m4



#5.Bootstrap crossvalidation

train.control_5<-trainControl(method = "repeatedcv",
                              number = 10,
                              repeats = 50)

set.seed(123)

m5<-train(formula,data=dev,trControl=train.control_5,method="glm")
m5



#展示Bootstrap的ROC
train.control_6<-trainControl(method = "repeatedcv",
                              number = 10,
                              repeats = 50,
                              classProbs = TRUE,
                              summaryFunction = twoClassSummary)

set.seed(123)
m6<-train(formula,data=dev,trControl=train.control_6,method="glm")
m6
as.matrix(m6$results[2])


#--------------------------------------------------------------------------
#构建最终模型后，几种验证方法
#1.模型预测概率及各模型因子ROC曲线
library(readr)
mydata <- read_csv("hypoglycemia_end.csv")
#构建最终模型
modelB <- glm(hypoglycemia ~course_of_disease + Hyperlipidemia + Treat_Time + 
                Education + gender + BUN + RBG + TC,data=dev, 
              family = binomial(link="logit"),x=TRUE)


#生成预测概率
dev$predmodelB<- predict(newdata=dev,modelB,"response")


#ROC找截点------------
library(pROC)

#在建模人群中绘制分别二条ROC曲线并给出阈值和ROC曲线下面积。
#建模集，model8的auc与roc分析
devmodelA <- roc(hypoglycemia~predmodelB, data = dev,smooth=F)
devmodelA


#ROC画图方法一(灵敏度与特异度ROC，非标准；但会给出AUC，阈值及对应的灵敏度和特异度)
plot(devmodelA, print.auc=TRUE, print.thres=TRUE,main = "ROC CURVE", col= "blue",print.thres.col="blue",identity.col="blue",
     identity.lty=1,identity.lwd=1)

##添加其他因子的ROC曲线一起比较
devroc1 <- plot.roc(dev$hypoglycemia, dev$predmodelB, main="dev ROC", percent=TRUE, col="1")
devroc2 <- lines.roc(dev$hypoglycemia, dev$TC, percent=TRUE, col="2")
devroc3 <- lines.roc(dev$hypoglycemia, dev$RBG, percent=TRUE, col="3")

legend("bottomright", legend=c("devmodelB", "TC","RBG"), col=c("1", "2","3"), lwd=2)


####This is the end########################################################


